Here we do exploratory data analysis on HMDA data obtained for Pennsylvania in the year 2017. We will start by looking at the data superficially and then dive into the columns of interest. Then we check for any missing values and handle them. Let's get started with the steps. ## Global setup like working directory, data directory etc. should happen here.
# Load the sys package and move the working directory two levels up from the
# directory the notebook is currently running in.
library(sys)
working_directory <- getwd()
# NOTE(review): inside a notebook chunk this setwd() only lasts until the chunk
# finishes; the knitr root.dir option is the persistent alternative.
setwd(dirname(dirname(working_directory)))
The working directory was changed to /Users/omkarpawar/Desktop/csp-571-02-final-project/src inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
writeLines("")
getwd()
[1] "/Users/omkarpawar/Desktop/csp-571-02-final-project/src"
data_dir <- "/Users/omkarpawar/Desktop/Data/PA/"
# Install every package this notebook needs that is not installed yet.
# https://stackoverflow.com/questions/4090169/elegant-way-to-check-for-missing-packages-and-install-them
list_of_packages <- c(
  "mlbench", "corrplot", "rvest", "tidyr", "stringr", "dplyr", "lubridate",
  "data.table", "mice", "scales", "naniar", "rpart", "rpart.plot", "caret",
  # Also attached with library() in this notebook, so they must be present too.
  "ggplot2", "rstudioapi", "janitor", "moments"
)
new.packages <- list_of_packages[!(list_of_packages %in% installed.packages()[, "Package"])]
if (length(new.packages) > 0) {
  message("Installing packages: ", paste(new.packages, collapse = ", "))
  # Pass the character vector itself. Writing new.packages() would call
  # utils::new.packages() rather than use the vector computed above.
  install.packages(new.packages)
}
library(corrplot)
library(ggplot2)
library(tidyr)
library(stringr)
library(dplyr)
library(data.table)
library(mice)
library(rstudioapi)
library(naniar)
# Load the shared helpers; the path is resolved relative to the document that
# is currently active in RStudio.
source(file.path(
  dirname(dirname(dirname(rstudioapi::getActiveDocumentContext()$path))),
  "utils/utils.r"
))
# Read the labelled 2017 PA HMDA file from the data directory.
hmda_data_pa <- fread(paste0(data_dir, "hmda_2017_pa_all-records_labels.csv"))
|--------------------------------------------------|
|==================================================|
Let's see the first few rows of our data and what they tell us about the applications.
# Continue with a plain data.frame instead of the object returned by fread().
hmda_data_pa_df <- as.data.frame(hmda_data_pa)
# Filter to include conventional loans only (loan_type code 1).
hmda_data_pa_df <- hmda_data_pa_df[hmda_data_pa_df[["loan_type"]] == "1", ]
names(hmda_data_pa_df)
[1] "as_of_year" "respondent_id"
[3] "agency_name" "agency_abbr"
[5] "agency_code" "loan_type_name"
[7] "loan_type" "property_type_name"
[9] "property_type" "loan_purpose_name"
[11] "loan_purpose" "owner_occupancy_name"
[13] "owner_occupancy" "loan_amount_000s"
[15] "preapproval_name" "preapproval"
[17] "action_taken_name" "action_taken"
[19] "msamd_name" "msamd"
[21] "state_name" "state_abbr"
[23] "state_code" "county_name"
[25] "county_code" "census_tract_number"
[27] "applicant_ethnicity_name" "applicant_ethnicity"
[29] "co_applicant_ethnicity_name" "co_applicant_ethnicity"
[31] "applicant_race_name_1" "applicant_race_1"
[33] "applicant_race_name_2" "applicant_race_2"
[35] "applicant_race_name_3" "applicant_race_3"
[37] "applicant_race_name_4" "applicant_race_4"
[39] "applicant_race_name_5" "applicant_race_5"
[41] "co_applicant_race_name_1" "co_applicant_race_1"
[43] "co_applicant_race_name_2" "co_applicant_race_2"
[45] "co_applicant_race_name_3" "co_applicant_race_3"
[47] "co_applicant_race_name_4" "co_applicant_race_4"
[49] "co_applicant_race_name_5" "co_applicant_race_5"
[51] "applicant_sex_name" "applicant_sex"
[53] "co_applicant_sex_name" "co_applicant_sex"
[55] "applicant_income_000s" "purchaser_type_name"
[57] "purchaser_type" "denial_reason_name_1"
[59] "denial_reason_1" "denial_reason_name_2"
[61] "denial_reason_2" "denial_reason_name_3"
[63] "denial_reason_3" "rate_spread"
[65] "hoepa_status_name" "hoepa_status"
[67] "lien_status_name" "lien_status"
[69] "edit_status_name" "edit_status"
[71] "sequence_number" "population"
[73] "minority_population" "hud_median_family_income"
[75] "tract_to_msamd_income" "number_of_owner_occupied_units"
[77] "number_of_1_to_4_family_units" "application_date_indicator"
writeLines("")
head(hmda_data_pa_df, 10)
NA
dim(hmda_data_pa_df)
[1] 333431 78
writeLines("Glimpse of hmda dataset for PA")
Glimpse of hmda dataset for PA
glimpse(hmda_data_pa_df)
Observations: 333,431
Variables: 78
$ as_of_year [3m[38;5;246m<int>[39m[23m 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, …
$ respondent_id [3m[38;5;246m<chr>[39m[23m "75-2921540", "36-4327855", "62-1532940", "0000480228", "75-…
$ agency_name [3m[38;5;246m<chr>[39m[23m "Department of Housing and Urban Development", "Department o…
$ agency_abbr [3m[38;5;246m<chr>[39m[23m "HUD", "HUD", "HUD", "CFPB", "HUD", "NCUA", "CFPB", "HUD", "…
$ agency_code [3m[38;5;246m<int>[39m[23m 7, 7, 7, 9, 7, 5, 9, 7, 7, 9, 9, 7, 5, 7, 7, 5, 7, 7, 9, 7, …
$ loan_type_name [3m[38;5;246m<chr>[39m[23m "Conventional", "Conventional", "Conventional", "Conventiona…
$ loan_type [3m[38;5;246m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ property_type_name [3m[38;5;246m<chr>[39m[23m "One-to-four family dwelling (other than manufactured housin…
$ property_type [3m[38;5;246m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ loan_purpose_name [3m[38;5;246m<chr>[39m[23m "Refinancing", "Home purchase", "Home purchase", "Refinancin…
$ loan_purpose [3m[38;5;246m<int>[39m[23m 3, 1, 1, 3, 3, 3, 1, 1, 1, 2, 2, 3, 3, 1, 1, 3, 3, 3, 2, 3, …
$ owner_occupancy_name [3m[38;5;246m<chr>[39m[23m "Owner-occupied as a principal dwelling", "Owner-occupied as…
$ owner_occupancy [3m[38;5;246m<int>[39m[23m 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, …
$ loan_amount_000s [3m[38;5;246m<int>[39m[23m 53, 416, 218, 70, 128, 153, 162, 344, 268, 18, 5, 310, 28, 2…
$ preapproval_name [3m[38;5;246m<chr>[39m[23m "Not applicable", "Preapproval was not requested", "Preappro…
$ preapproval [3m[38;5;246m<int>[39m[23m 3, 2, 2, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
$ action_taken_name [3m[38;5;246m<chr>[39m[23m "Application withdrawn by applicant", "Loan originated", "Lo…
$ action_taken [3m[38;5;246m<int>[39m[23m 4, 1, 1, 3, 4, 4, 1, 6, 1, 3, 3, 1, 3, 1, 1, 1, 4, 1, 3, 4, …
$ msamd_name [3m[38;5;246m<chr>[39m[23m "Philadelphia - PA", "Philadelphia - PA", "Montgomery County…
$ msamd [3m[38;5;246m<int>[39m[23m 37964, 37964, 33874, 37964, 10900, 37964, 14100, 37964, 3387…
$ state_name [3m[38;5;246m<chr>[39m[23m "Pennsylvania", "Pennsylvania", "Pennsylvania", "Pennsylvani…
$ state_abbr [3m[38;5;246m<chr>[39m[23m "PA", "PA", "PA", "PA", "PA", "PA", "PA", "PA", "PA", "PA", …
$ state_code [3m[38;5;246m<int>[39m[23m 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, …
$ county_name [3m[38;5;246m<chr>[39m[23m "Philadelphia County", "Philadelphia County", "Montgomery Co…
$ county_code [3m[38;5;246m<int>[39m[23m 101, 101, 91, 101, 77, 101, 37, 101, 91, 129, 101, 91, 101, …
$ census_tract_number [3m[38;5;246m<dbl>[39m[23m 173.00, 160.00, 2007.03, 173.00, 92.00, 55.00, 506.00, 160.0…
$ applicant_ethnicity_name [3m[38;5;246m<chr>[39m[23m "Not Hispanic or Latino", "Not Hispanic or Latino", "Not His…
$ applicant_ethnicity [3m[38;5;246m<int>[39m[23m 2, 2, 2, 3, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 3, 2, 3, 3, 3, 2, …
$ co_applicant_ethnicity_name [3m[38;5;246m<chr>[39m[23m "No co-applicant", "Information not provided by applicant in…
$ co_applicant_ethnicity [3m[38;5;246m<int>[39m[23m 5, 3, 5, 5, 5, 2, 5, 5, 5, 2, 5, 2, 5, 5, 5, 2, 5, 5, 5, 5, …
$ applicant_race_name_1 [3m[38;5;246m<chr>[39m[23m "Black or African American", "White", "White", "Information …
$ applicant_race_1 [3m[38;5;246m<int>[39m[23m 3, 5, 5, 6, 5, 3, 5, 6, 6, 5, 5, 5, 6, 5, 3, 5, 6, 6, 6, 3, …
$ applicant_race_name_2 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ applicant_race_2 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ applicant_race_name_3 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ applicant_race_3 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ applicant_race_name_4 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ applicant_race_4 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ applicant_race_name_5 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ applicant_race_5 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ co_applicant_race_name_1 [3m[38;5;246m<chr>[39m[23m "No co-applicant", "Information not provided by applicant in…
$ co_applicant_race_1 [3m[38;5;246m<int>[39m[23m 8, 6, 8, 8, 8, 3, 8, 8, 8, 3, 8, 3, 8, 8, 8, 5, 8, 8, 8, 8, …
$ co_applicant_race_name_2 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ co_applicant_race_2 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ co_applicant_race_name_3 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ co_applicant_race_3 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ co_applicant_race_name_4 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ co_applicant_race_4 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ co_applicant_race_name_5 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ co_applicant_race_5 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ applicant_sex_name [3m[38;5;246m<chr>[39m[23m "Male", "Female", "Male", "Information not provided by appli…
$ applicant_sex [3m[38;5;246m<int>[39m[23m 1, 2, 1, 3, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 3, 3, 2, 1, …
$ co_applicant_sex_name [3m[38;5;246m<chr>[39m[23m "No co-applicant", "Information not provided by applicant in…
$ co_applicant_sex [3m[38;5;246m<int>[39m[23m 5, 3, 5, 5, 5, 2, 5, 5, 5, 2, 5, 1, 5, 5, 5, 2, 5, 5, 5, 5, …
$ applicant_income_000s [3m[38;5;246m<int>[39m[23m 12, 118, 53, 62, 84, 64, 92, 141, 90, 27, 72, 116, 45, 48, N…
$ purchaser_type_name [3m[38;5;246m<chr>[39m[23m "Loan was not originated or was not sold in calendar year co…
$ purchaser_type [3m[38;5;246m<int>[39m[23m 0, 7, 0, 0, 0, 0, 1, 3, 6, 0, 0, 7, 0, 6, 0, 0, 0, 3, 0, 0, …
$ denial_reason_name_1 [3m[38;5;246m<chr>[39m[23m "", "", "", "Debt-to-income ratio", "", "", "", "", "", "Cre…
$ denial_reason_1 [3m[38;5;246m<int>[39m[23m NA, NA, NA, 1, NA, NA, NA, NA, NA, 3, 1, NA, NA, NA, NA, NA,…
$ denial_reason_name_2 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "Credit history", ""…
$ denial_reason_2 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 3, NA, NA, NA, NA, N…
$ denial_reason_name_3 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ denial_reason_3 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ rate_spread [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ hoepa_status_name [3m[38;5;246m<chr>[39m[23m "Not a HOEPA loan", "Not a HOEPA loan", "Not a HOEPA loan", …
$ hoepa_status [3m[38;5;246m<int>[39m[23m 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ lien_status_name [3m[38;5;246m<chr>[39m[23m "Secured by a first lien", "Secured by a first lien", "Secur…
$ lien_status [3m[38;5;246m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 4, 1, 3, 3, 1, 2, 1, 1, 1, 1, 1, 3, 1, …
$ edit_status_name [3m[38;5;246m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ edit_status [3m[38;5;246m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ sequence_number [3m[38;5;246m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ population [3m[38;5;246m<int>[39m[23m 3202, 6711, 4262, 3202, 3870, 6385, 5401, 6711, 4262, 2986, …
$ minority_population [3m[38;5;246m<dbl>[39m[23m 97.28, 12.86, 32.50, 97.28, 17.52, 90.60, 7.87, 12.86, 32.50…
$ hud_median_family_income [3m[38;5;246m<int>[39m[23m 57400, 57400, 102600, 57400, 75200, 57400, 62400, 57400, 102…
$ tract_to_msamd_income [3m[38;5;246m<dbl>[39m[23m 47.54, 92.76, 100.13, 47.54, 92.06, 101.79, 63.22, 92.76, 10…
$ number_of_owner_occupied_units [3m[38;5;246m<int>[39m[23m 710, 1702, 1221, 710, 997, 1595, 1221, 1702, 1221, 1049, 170…
$ number_of_1_to_4_family_units [3m[38;5;246m<int>[39m[23m 1314, 3063, 1299, 1314, 1460, 2475, 2334, 3063, 1299, 1446, …
$ application_date_indicator [3m[38;5;246m<lgl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
Now, let's look at the missing values that are present in our data. We go through this in 4 steps. First we look for any NAs, then empty strings, then missing values encoded as “?”, and at last NULL values.
writeLines("Checking for missing values with NA")
Checking for missing values with NA
# Number of NA entries in every column, as a named integer vector.
vapply(hmda_data_pa_df, function(column) sum(is.na(column)), integer(1))
as_of_year respondent_id agency_name
0 0 0
agency_abbr agency_code loan_type_name
0 0 0
loan_type property_type_name property_type
0 0 0
loan_purpose_name loan_purpose owner_occupancy_name
0 0 0
owner_occupancy loan_amount_000s preapproval_name
0 0 0
preapproval action_taken_name action_taken
0 0 0
msamd_name msamd state_name
0 27970 0
state_abbr state_code county_name
0 0 0
county_code census_tract_number applicant_ethnicity_name
367 673 0
applicant_ethnicity co_applicant_ethnicity_name co_applicant_ethnicity
0 0 0
applicant_race_name_1 applicant_race_1 applicant_race_name_2
0 0 0
applicant_race_2 applicant_race_name_3 applicant_race_3
331952 0 333327
applicant_race_name_4 applicant_race_4 applicant_race_name_5
0 333409 0
applicant_race_5 co_applicant_race_name_1 co_applicant_race_1
333412 0 0
co_applicant_race_name_2 co_applicant_race_2 co_applicant_race_name_3
0 332993 0
co_applicant_race_3 co_applicant_race_name_4 co_applicant_race_4
333404 0 333425
co_applicant_race_name_5 co_applicant_race_5 applicant_sex_name
0 333425 0
applicant_sex co_applicant_sex_name co_applicant_sex
0 0 0
applicant_income_000s purchaser_type_name purchaser_type
20983 0 0
denial_reason_name_1 denial_reason_1 denial_reason_name_2
0 292570 0
denial_reason_2 denial_reason_name_3 denial_reason_3
324810 0 332373
rate_spread hoepa_status_name hoepa_status
326995 0 0
lien_status_name lien_status edit_status_name
0 0 333431
edit_status sequence_number population
333431 333431 673
minority_population hud_median_family_income tract_to_msamd_income
673 673 673
number_of_owner_occupied_units number_of_1_to_4_family_units application_date_indicator
673 673 333431
writeLines("Checking for missing values with empty strings")
Checking for missing values with empty strings
# Number of empty-string entries in every column. na.rm = TRUE is required:
# comparing NA with "" yields NA, which would otherwise turn the whole column
# count into NA.
vapply(hmda_data_pa_df, function(x) sum(x == "", na.rm = TRUE), integer(1))
as_of_year respondent_id agency_name
0 0 0
agency_abbr agency_code loan_type_name
0 0 0
loan_type property_type_name property_type
0 0 0
loan_purpose_name loan_purpose owner_occupancy_name
0 0 0
owner_occupancy loan_amount_000s preapproval_name
0 0 0
preapproval action_taken_name action_taken
0 0 0
msamd_name msamd state_name
28020 NA 0
state_abbr state_code county_name
0 0 367
county_code census_tract_number applicant_ethnicity_name
NA NA 0
applicant_ethnicity co_applicant_ethnicity_name co_applicant_ethnicity
0 0 0
applicant_race_name_1 applicant_race_1 applicant_race_name_2
0 0 331952
applicant_race_2 applicant_race_name_3 applicant_race_3
NA 333327 NA
applicant_race_name_4 applicant_race_4 applicant_race_name_5
333409 NA 333412
applicant_race_5 co_applicant_race_name_1 co_applicant_race_1
NA 0 0
co_applicant_race_name_2 co_applicant_race_2 co_applicant_race_name_3
332993 NA 333404
co_applicant_race_3 co_applicant_race_name_4 co_applicant_race_4
NA 333425 NA
co_applicant_race_name_5 co_applicant_race_5 applicant_sex_name
333425 NA 0
applicant_sex co_applicant_sex_name co_applicant_sex
0 0 0
applicant_income_000s purchaser_type_name purchaser_type
NA 0 0
denial_reason_name_1 denial_reason_1 denial_reason_name_2
292570 NA 324810
denial_reason_2 denial_reason_name_3 denial_reason_3
NA 332373 NA
rate_spread hoepa_status_name hoepa_status
NA 0 0
lien_status_name lien_status edit_status_name
0 0 NA
edit_status sequence_number population
NA NA NA
minority_population hud_median_family_income tract_to_msamd_income
NA NA NA
number_of_owner_occupied_units number_of_1_to_4_family_units application_date_indicator
NA NA NA
writeLines("Checking for missing values with ?")
Checking for missing values with ?
# Number of "?" entries in every column. na.rm = TRUE is required: comparing
# NA with "?" yields NA, which would otherwise turn the whole column count
# into NA.
vapply(hmda_data_pa_df, function(x) sum(x == "?", na.rm = TRUE), integer(1))
as_of_year respondent_id agency_name
0 0 0
agency_abbr agency_code loan_type_name
0 0 0
loan_type property_type_name property_type
0 0 0
loan_purpose_name loan_purpose owner_occupancy_name
0 0 0
owner_occupancy loan_amount_000s preapproval_name
0 0 0
preapproval action_taken_name action_taken
0 0 0
msamd_name msamd state_name
0 NA 0
state_abbr state_code county_name
0 0 0
county_code census_tract_number applicant_ethnicity_name
NA NA 0
applicant_ethnicity co_applicant_ethnicity_name co_applicant_ethnicity
0 0 0
applicant_race_name_1 applicant_race_1 applicant_race_name_2
0 0 0
applicant_race_2 applicant_race_name_3 applicant_race_3
NA 0 NA
applicant_race_name_4 applicant_race_4 applicant_race_name_5
0 NA 0
applicant_race_5 co_applicant_race_name_1 co_applicant_race_1
NA 0 0
co_applicant_race_name_2 co_applicant_race_2 co_applicant_race_name_3
0 NA 0
co_applicant_race_3 co_applicant_race_name_4 co_applicant_race_4
NA 0 NA
co_applicant_race_name_5 co_applicant_race_5 applicant_sex_name
0 NA 0
applicant_sex co_applicant_sex_name co_applicant_sex
0 0 0
applicant_income_000s purchaser_type_name purchaser_type
NA 0 0
denial_reason_name_1 denial_reason_1 denial_reason_name_2
0 NA 0
denial_reason_2 denial_reason_name_3 denial_reason_3
NA 0 NA
rate_spread hoepa_status_name hoepa_status
NA 0 0
lien_status_name lien_status edit_status_name
0 0 NA
edit_status sequence_number population
NA NA NA
minority_population hud_median_family_income tract_to_msamd_income
NA NA NA
number_of_owner_occupied_units number_of_1_to_4_family_units application_date_indicator
NA NA NA
writeLines("Checking for missing values with null")
Checking for missing values with null
# `x == NULL` evaluates to logical(0), so summing it reports 0 regardless of
# what the column holds; is.null() is the actual test for NULL.
vapply(hmda_data_pa_df, function(x) sum(is.null(x)), integer(1))
as_of_year respondent_id agency_name
0 0 0
agency_abbr agency_code loan_type_name
0 0 0
loan_type property_type_name property_type
0 0 0
loan_purpose_name loan_purpose owner_occupancy_name
0 0 0
owner_occupancy loan_amount_000s preapproval_name
0 0 0
preapproval action_taken_name action_taken
0 0 0
msamd_name msamd state_name
0 0 0
state_abbr state_code county_name
0 0 0
county_code census_tract_number applicant_ethnicity_name
0 0 0
applicant_ethnicity co_applicant_ethnicity_name co_applicant_ethnicity
0 0 0
applicant_race_name_1 applicant_race_1 applicant_race_name_2
0 0 0
applicant_race_2 applicant_race_name_3 applicant_race_3
0 0 0
applicant_race_name_4 applicant_race_4 applicant_race_name_5
0 0 0
applicant_race_5 co_applicant_race_name_1 co_applicant_race_1
0 0 0
co_applicant_race_name_2 co_applicant_race_2 co_applicant_race_name_3
0 0 0
co_applicant_race_3 co_applicant_race_name_4 co_applicant_race_4
0 0 0
co_applicant_race_name_5 co_applicant_race_5 applicant_sex_name
0 0 0
applicant_sex co_applicant_sex_name co_applicant_sex
0 0 0
applicant_income_000s purchaser_type_name purchaser_type
0 0 0
denial_reason_name_1 denial_reason_1 denial_reason_name_2
0 0 0
denial_reason_2 denial_reason_name_3 denial_reason_3
0 0 0
rate_spread hoepa_status_name hoepa_status
0 0 0
lien_status_name lien_status edit_status_name
0 0 0
edit_status sequence_number population
0 0 0
minority_population hud_median_family_income tract_to_msamd_income
0 0 0
number_of_owner_occupied_units number_of_1_to_4_family_units application_date_indicator
0 0 0
First, we look at the race and ethnicity columns and see what information they provide and how the values are distributed per variable.
library(janitor)
writeLines("")
writeLines("Application ethnicity values")
Application ethnicity values
unique(hmda_data_pa_df$applicant_ethnicity_name)
[1] "Not Hispanic or Latino"
[2] "Information not provided by applicant in mail, Internet, or telephone application"
[3] "Hispanic or Latino"
[4] "Not applicable"
writeLines("")
writeLines("Application race name 1 values")
Application race name 1 values
unique(hmda_data_pa_df$applicant_race_1)
[1] 3 5 6 2 4 7 1
unique(hmda_data_pa_df$applicant_race_name_1)
[1] "Black or African American"
[2] "White"
[3] "Information not provided by applicant in mail, Internet, or telephone application"
[4] "Asian"
[5] "Native Hawaiian or Other Pacific Islander"
[6] "Not applicable"
[7] "American Indian or Alaska Native"
Now, let's take the applicants whose ethnicity is Hispanic or Latino, group them by race, and print the count for each race.
# Among applicants recorded as "Hispanic or Latino", count applications per
# first-listed race, add a row-wise Total column and sort by it, largest first.
grouped_by_race_info <- hmda_data_pa_df %>%
  filter(applicant_ethnicity_name == "Hispanic or Latino") %>%
  group_by(applicant_race_name_1) %>%
  count() %>%
  ungroup() %>%
  replace(is.na(.), 0) %>%
  adorn_totals("col") %>%
  arrange(desc(Total))
head(grouped_by_race_info)
applicant_race_name_1 n Total
White 8038 8038
Information not provided by applicant in mail, Internet, or telephone application 1363 1363
Black or African American 504 504
American Indian or Alaska Native 315 315
Native Hawaiian or Other Pacific Islander 231 231
Asian 101 101
We do this because we want to merge these two columns into one and deal with it as one single predictor.
# Merge ethnicity and first-listed race into a single label: rows whose
# ethnicity is "Hispanic or Latino" get that label, all others keep race 1.
# The same rule is applied to the applicant and to the co-applicant.
hmda_data_pa_df[["applicant_race_and_ethnicity"]] <- ifelse(
  hmda_data_pa_df[["applicant_ethnicity_name"]] == "Hispanic or Latino",
  "Hispanic or Latino",
  hmda_data_pa_df[["applicant_race_name_1"]]
)
hmda_data_pa_df[["co_applicant_race_and_ethnicity"]] <- ifelse(
  hmda_data_pa_df[["co_applicant_ethnicity_name"]] == "Hispanic or Latino",
  "Hispanic or Latino",
  hmda_data_pa_df[["co_applicant_race_name_1"]]
)
writeLines("")
writeLines("Unique values for the applicant_race_and_ethnicity column")
Unique values for the applicant_race_and_ethnicity column
writeLines("")
unique(hmda_data_pa_df$applicant_race_and_ethnicity)
[1] "Black or African American"
[2] "White"
[3] "Information not provided by applicant in mail, Internet, or telephone application"
[4] "Asian"
[5] "Native Hawaiian or Other Pacific Islander"
[6] "Hispanic or Latino"
[7] "Not applicable"
[8] "American Indian or Alaska Native"
head(hmda_data_pa_df)
NA
See how the loan applications are distributed according to race and ethnicity. We summarise the count of applications according to the applicant's race.
# Applications per combined race/ethnicity label, largest group first.
mortgage_by_race_and_ethnicity <- hmda_data_pa_df %>%
  group_by(applicant_race_and_ethnicity) %>%
  summarise(EthnicityCount = n()) %>%
  arrange(desc(EthnicityCount))
# Plotting helper defined outside this notebook (presumably utils.r).
graph_by_enthicity(mortgage_by_race_and_ethnicity)
The bar chart shows that there are more applications made by the White population, which is justifiable as the US has a majority White population. Now, let's dive even deeper and see how the actions are taken on applications for each race and ethnicity category.
# Applications per (action taken, race/ethnicity) pair, largest first.
mortgage_status_by_race_and_ethnicity <- hmda_data_pa_df %>%
  group_by(action_taken_name, applicant_race_and_ethnicity) %>%
  summarise(ActionCount = n()) %>%
  arrange(desc(ActionCount))
# Attach each group's total and express every action as a percentage of the
# applications made by that race/ethnicity group.
mortgage_status_aggregated_by_race_and_ethnicity <- mortgage_status_by_race_and_ethnicity %>%
  inner_join(mortgage_by_race_and_ethnicity) %>%
  mutate(percentage = (ActionCount / EthnicityCount) * 100)
Joining, by = "applicant_race_and_ethnicity"
graph_application_race_proportion_of_loans(mortgage_status_aggregated_by_race_and_ethnicity)
The graph above clearly shows that the denial rate is higher for minorities, and to be more specific, it is highest for African Americans. One more thing to notice is that in the category where the applicant's race is unknown, most of the loans are purchased by the institution.
# Keep originated loans only (action_taken code 1) and plot their income
# histogram with the helper defined outside this notebook.
hmda_origination_status_df <- hmda_data_pa_df[hmda_data_pa_df[["action_taken"]] == "1", ]
graph_applicant_income_histogram(
  hmda_origination_status_df,
  "Applicant income distribution for originated loans"
)
Now let's see how the income distribution looks for applicants. Let's see the median income for each category.
# Keep originated loans only (action_taken code 1).
hmda_origination_status_df <- hmda_data_pa_df[hmda_data_pa_df$action_taken == "1", ]
head(hmda_origination_status_df)
# Histogram of the HUD median family income recorded on each originated loan,
# in bins of width 1000. The stray empty argument after binwidth was removed.
hmda_origination_status_df %>%
  ggplot(aes(as.numeric(hud_median_family_income))) +
  geom_histogram(binwidth = 1000, fill = "blue") +
  labs(
    x = "Median Income",
    y = "Applicant Count",
    title = "Median Income Distribution for Area for Originated Loans"
  ) +
  theme_bw()
# Share of all applications per county, top 20 counties.
mortgage_distribution_by_counties <- hmda_data_pa_df %>%
  # Missing county names are stored as "" in this data set, so is.na() alone
  # never removes them; drop both forms before computing the shares.
  filter(!is.na(county_name), county_name != "") %>%
  group_by(county_name) %>%
  summarise(CountLoans = n()) %>%
  mutate(percentage = (CountLoans / sum(CountLoans)) * 100) %>%
  # Order the county factor by share so the plot can use that ordering.
  mutate(county_name = reorder(county_name, percentage)) %>%
  arrange(desc(percentage)) %>%
  head(20)
graph_distribution_by_county(mortgage_distribution_by_counties)
# Share of originated loans per county, top 20 counties.
originated_mortgage_distribution_by_counties <- hmda_origination_status_df %>%
  # Missing county names are stored as "" in this data set, so is.na() alone
  # never removes them; drop both forms before computing the shares.
  filter(!is.na(county_name), county_name != "") %>%
  group_by(county_name) %>%
  summarise(CountLoans = n()) %>%
  mutate(percentage = (CountLoans / sum(CountLoans)) * 100) %>%
  # Order the county factor by share so the plot can use that ordering.
  mutate(county_name = reorder(county_name, percentage)) %>%
  arrange(desc(percentage)) %>%
  head(20)
graph_distribution_by_county(originated_mortgage_distribution_by_counties)
# Counties examined individually in the plots that follow.
county_names <- c(
  "Allegheny County", "Philadelphia County", "Montgomery County", "Bucks County"
)
# One plot per county: application counts by first-listed race, largest first.
for (county_name in county_names) {
  hmda_data_county_df <- hmda_data_pa_df[hmda_data_pa_df$county_name == county_name, ]
  mortgage_by_race_county <- hmda_data_county_df %>%
    group_by(applicant_race_name_1) %>%
    summarise(RaceCount = n()) %>%
    arrange(desc(RaceCount))
  print(graph_mortgage_distribution_by_race1(mortgage_by_race_county))
}
# For each county, plot the income histogram of originated loans (action_taken
# code 1), once for White applicants and once for Black or African American
# applicants.
for (county_name in county_names) {
  hmda_origination_status_df_by_county_white <- hmda_data_pa_df[
    hmda_data_pa_df$action_taken == "1" &
      hmda_data_pa_df$county_name == county_name &
      hmda_data_pa_df$applicant_race_name_1 == "White",
  ]
  print(graph_applicant_income_histogram(
    hmda_origination_status_df_by_county_white,
    "Income distribution for Whites"
  ))
  hmda_origination_status_df_by_county_african_american <- hmda_data_pa_df[
    hmda_data_pa_df$action_taken == "1" &
      hmda_data_pa_df$county_name == county_name &
      hmda_data_pa_df$applicant_race_name_1 == "Black or African American",
  ]
  print(graph_applicant_income_histogram(
    hmda_origination_status_df_by_county_african_american,
    "Income distribution for African Americans"
  ))
}
# Counties examined individually in the plots that follow.
county_names <- c(
  "Allegheny County", "Philadelphia County", "Montgomery County", "Bucks County"
)
# One plot per county: application counts by the combined race/ethnicity
# label, largest first.
for (county_name in county_names) {
  hmda_data_county_df <- hmda_data_pa_df[hmda_data_pa_df$county_name == county_name, ]
  mortgage_by_race_county <- hmda_data_county_df %>%
    group_by(applicant_race_and_ethnicity) %>%
    summarise(RaceCount = n()) %>%
    arrange(desc(RaceCount))
  print(graph_mortgage_distribution_by_race_and_ethnicity(mortgage_by_race_county))
}
# For each county: total applications per race/ethnicity group, applications
# per (action, group) pair, and each action's percentage within its group;
# then plot those percentages.
for (county_name in county_names) {
  hmda_data_county_df <- hmda_data_pa_df[hmda_data_pa_df$county_name == county_name, ]
  mortgage_by_race1_county <- hmda_data_county_df %>%
    group_by(applicant_race_and_ethnicity) %>%
    summarise(RaceCount = n()) %>%
    arrange(desc(RaceCount))
  mortgage_status_by_race1_by_county <- hmda_data_county_df %>%
    group_by(action_taken_name, applicant_race_and_ethnicity) %>%
    summarise(ActionCount = n()) %>%
    arrange(desc(ActionCount))
  mortgage_status_aggregated_by_race1_by_county <- mortgage_status_by_race1_by_county %>%
    inner_join(mortgage_by_race1_county) %>%
    mutate(percentage = (ActionCount / RaceCount) * 100)
  print(graph_application_race_and_ethnicity_proportion_of_loans(mortgage_status_aggregated_by_race1_by_county))
}
Joining, by = "applicant_race_and_ethnicity"
visualize_missing_values(hmda_data_pa_df)
In this graph, we see the missing value count for each column and for each category too. There are a lot of missing values in some columns, like the co-applicant columns and the applicant race 2-3-4 columns.
Now we try to impute the missing values. The easy way out here is to impute them with the mice function. It's not the best approach, but initially we go with this and see how it performs. # Impute as needed.
# Impute missing values with mice using the CART method.
# https://www.rdocumentation.org/packages/mice/versions/3.8.0/topics/mice.impute.cart
# The `method` argument is spelled out in full instead of relying on partial
# matching of `meth`.
hmda_data_pa_df_imputed <- mice(hmda_data_pa_df, m = 1, maxit = 2, method = "cart", seed = 500)
# Replace the mice result object with the completed data frame.
hmda_data_pa_df_imputed <- mice::complete(hmda_data_pa_df_imputed)
summary(hmda_data_pa_df_imputed)
gg_miss_upset(hmda_data_pa_df_imputed)
# Correlation inputs built from the non-imputed data.
# https://stackoverflow.com/questions/20637360/convert-all-data-frame-character-columns-to-factors
hmda_data_pa_df$loan_to_income_ratio <- hmda_data_pa_df$loan_amount_000s / hmda_data_pa_df$applicant_income_000s
character_columns <- sapply(hmda_data_pa_df, is.character)
hmda_data_pa_df[character_columns] <- lapply(hmda_data_pa_df[character_columns], as.factor)
# Convert every column to numeric for cor(). as.numeric() maps factors to their
# level codes and, unlike as.integer(), keeps the fractional part of columns
# such as loan_to_income_ratio. Non-finite values (e.g. a ratio with zero
# income) become NA, which is what as.integer() produced for them as well.
hmda_data_pa_df_for_correlation <- as.data.frame(lapply(hmda_data_pa_df, function(x) {
  x <- as.numeric(x)
  x[!is.finite(x)] <- NA
  x
}))
#head(hmda_data_pa_df_for_correlation[, c("applicant_income_000s", "loan_amount_000s")])
head(hmda_data_pa_df_for_correlation)
corr_simple(hmda_data_pa_df_for_correlation)
corrplot(cor(hmda_data_pa_df_for_correlation[, c("applicant_income_000s", "loan_amount_000s")], use = "na.or.complete"))
# Correlation inputs built from the imputed data.
# hmda_data_pa_df_imputed <- hmda_data_pa_df;
# https://stackoverflow.com/questions/20637360/convert-all-data-frame-character-columns-to-factors
hmda_data_pa_df_imputed$loan_to_income_ratio <- hmda_data_pa_df_imputed$loan_amount_000s / hmda_data_pa_df_imputed$applicant_income_000s
character_columns <- sapply(hmda_data_pa_df_imputed, is.character)
hmda_data_pa_df_imputed[character_columns] <- lapply(hmda_data_pa_df_imputed[character_columns], as.factor)
# Convert every column to numeric for cor(). as.numeric() maps factors to their
# level codes and, unlike as.integer(), keeps the fractional part of columns
# such as loan_to_income_ratio. Non-finite values (e.g. a ratio with zero
# income) become NA, which is what as.integer() produced for them as well.
hmda_data_pa_df_imputed_for_correlation <- as.data.frame(lapply(hmda_data_pa_df_imputed, function(x) {
  x <- as.numeric(x)
  x[!is.finite(x)] <- NA
  x
}))
head(hmda_data_pa_df_imputed_for_correlation[, c("applicant_income_000s", "loan_amount_000s")])
corr_simple(hmda_data_pa_df_imputed_for_correlation)
corrplot(cor(hmda_data_pa_df_imputed_for_correlation[, c("applicant_income_000s", "loan_amount_000s")], use = "na.or.complete"))
# Build the modelling data frame from the imputed data using helpers defined
# outside this notebook.
hmda_model_df <- hmda_data_frame_for_model(hmda_data_pa_df_imputed)
hmda_model_df <- process_model_df_columns(hmda_model_df)
# Application counts per race/ethnicity, stacked by loan decision. geom_bar()
# counts rows by default, which is what geom_histogram(stat = "count") was
# being asked to do.
l <- ggplot(hmda_model_df, aes(applicant_race_and_ethnicity, fill = loan_granted))
l <- l + geom_bar() + coord_flip()
print(l)
# Application counts per loan purpose, stacked by loan decision.
l <- ggplot(hmda_model_df, aes(loan_purpose, fill = loan_granted))
l <- l + geom_bar() + coord_flip()
print(l)
# Overall distribution of the loan_granted variable.
plot(hmda_model_df$loan_granted,
  main = "Loan granted Variable",
  col = colors()[100:102],
  xlab = "Loan distribution"
)
# Histogram of the raw loan amount, annotated with its skewness. skewness() is
# called through the moments namespace because library(moments) is only
# attached further down in the notebook.
skew <- paste("Skewness:", moments::skewness(hmda_model_df$loan_amount_000s, na.rm = TRUE))
ggplot(data = hmda_model_df, aes(x = loan_amount_000s)) +
  geom_histogram(fill = "steelblue") +
  labs(title = "Loan amount distribution", x = "Loan amount in thousands", y = "Count") +
  annotate("text", x = 100000, y = 300000, size = 3.2, label = skew)
Looks like the data is highly skewed.
#install.packages("moments")
# Attach moments before calling skewness() below.
library(moments)
# Skewness of the raw loan amount (in thousands), ignoring missing values.
skewness(hmda_model_df$loan_amount_000s,na.rm = TRUE)
The data for loan amount is highly right skewed. Changes should be made so that the prediction model does not mess up.
# Histogram of the log-transformed loan amount, annotated with the skewness of
# the transformed values.
skew <- paste(
  "Skewness:",
  skewness(log(hmda_model_df$loan_amount_000s), na.rm = TRUE)
)
ggplot(data = hmda_model_df, aes(x = log(loan_amount_000s))) +
  geom_histogram(fill = "steelblue") +
  labs(
    title = "Log transformed distribution for Loan amount",
    x = "log(Loan Amount)",
    y = "Count"
  ) +
  annotate("text", x = 8, y = 100000, size = 3.2, label = skew)
skewness(log(hmda_model_df$loan_amount_000s), na.rm = TRUE)
# Boxplot of the same log-transformed values.
boxplot(
  log(hmda_model_df$loan_amount_000s),
  col = colors()[100:109],
  main = "Boxplot of Log of Loan Amounts",
  xlab = "Loan Amount",
  ylab = "Distribution of Log of Loan Amounts"
)
# Histogram of the raw applicant income, annotated with its skewness.
skew <- paste(
  "Skewness:",
  skewness(hmda_model_df$applicant_income_000s, na.rm = TRUE)
)
ggplot(data = hmda_model_df, aes(x = applicant_income_000s)) +
  geom_histogram(fill = "steelblue") +
  labs(
    title = "Applicant Income distribution",
    x = "Applicant Income in thousands",
    y = "Count"
  ) +
  annotate("text", x = 100000, y = 90000, size = 3.2, label = skew)
# Histogram of the log-transformed applicant income, annotated with the
# skewness of the transformed values.
skew <- paste(
  "Skewness:",
  skewness(log(hmda_model_df$applicant_income_000s), na.rm = TRUE)
)
ggplot(data = hmda_model_df, aes(x = log(applicant_income_000s))) +
  geom_histogram(fill = "steelblue") +
  labs(
    title = "Log transformed distribution for Applicant Income",
    x = "log(Applicant Income)",
    y = "Count"
  ) +
  annotate("text", x = 10, y = 90000, size = 3.2, label = skew)
# Log of loan amount, one box per loan decision.
boxplot(
  log(loan_amount_000s) ~ loan_granted,
  data = hmda_model_df,
  xlab = "Loan decision",
  ylab = "Log of Loan Amounts",
  col = c("pink", "lightblue"),
  main = "Exploratory Data Analysis Plot\n of Loan Decision Versus Log of Loan Amounts"
)
# Log of applicant income, one box per loan decision.
boxplot(
  log(applicant_income_000s) ~ loan_granted,
  data = hmda_model_df,
  xlab = "Loan decision",
  ylab = "Log of Applicant Income",
  col = c("pink", "lightblue"),
  main = "Exploratory Data Analysis Plot\n of Loan Decision Versus Log of Applicant Income"
)
# Jittered scatter plots of three measures against race/ethnicity, coloured by
# loan decision: log income, log loan amount, and loan-to-income ratio.
ggplot(
  hmda_model_df,
  aes(log(applicant_income_000s), applicant_race_and_ethnicity, color = loan_granted)
) +
  geom_jitter() +
  ggtitle("Log of Applicant income vs. Applicant race and ethnicity , by color = Loan decision") +
  theme_light()
ggplot(
  hmda_model_df,
  aes(log(loan_amount_000s), applicant_race_and_ethnicity, color = loan_granted)
) +
  geom_jitter() +
  ggtitle("Log of loan amount vs. Applicant race and ethnicity , by color = Loan decision") +
  theme_light()
ggplot(
  hmda_model_df,
  aes(loan_to_income_ratio, applicant_race_and_ethnicity, color = loan_granted)
) +
  geom_jitter() +
  ggtitle("Loan to Income ratio vs. Applicant race and ethnicity , by color = Loan decision") +
  theme_light()
# Save the imputed data set in the "2017" folder under data_dir. data_dir
# already ends with "/", so no extra separator is inserted, and the folder is
# created first so write.csv() does not fail when it is missing.
imputed_output_dir <- paste0(data_dir, "2017")
dir.create(imputed_output_dir, showWarnings = FALSE, recursive = TRUE)
write.csv(
  hmda_data_pa_df_imputed,
  file.path(imputed_output_dir, "hmda_2017_pa_imputed.csv"),
  row.names = FALSE
)